Get A and B compartment CpGs
library(GenomicRanges)
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library(rtracklayer)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(foreach)
library(ggplot2)
library(readr)
# Get CpG locations
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
## Loading required package: minfi
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
## Loading required package: Biostrings
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## Loading required package: bumphunter
## Loading required package: iterators
## Loading required package: locfit
## locfit 1.5-9.4 2020-03-24
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
#data(IlluminaHumanMethylation450kanno.ilmn12.hg19)
data(Locations)
cpg_gr <- GRanges(Locations$chr, IRanges(Locations$pos, Locations$pos))
mcols(cpg_gr)$cg <- rownames(Locations)
cpg_gr
## GRanges object with 485512 ranges and 1 metadata column:
## seqnames ranges strand | cg
## <Rle> <IRanges> <Rle> | <character>
## [1] chrY 9363356 * | cg00050873
## [2] chrY 21239348 * | cg00212031
## [3] chrY 8148233 * | cg00213748
## [4] chrY 15815688 * | cg00214611
## [5] chrY 9385539 * | cg00455876
## ... ... ... ... . ...
## [485508] chr22 46114168 * | ch.22.909671F
## [485509] chr22 48451677 * | ch.22.46830341F
## [485510] chr22 48731367 * | ch.22.1008279F
## [485511] chr22 49193714 * | ch.22.47579720R
## [485512] chr22 49888838 * | ch.22.48274842R
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
# Get CGI and colon A/B compartment annotation
a <- import("../input/compartment_A.bed")
b <- import("../input/compartment_B.bed")
cpg_gr$compartment <- NA
cpg_gr$compartment[countOverlaps(cpg_gr, a)>0] <- "A"
cpg_gr$compartment[countOverlaps(cpg_gr, b)>0] <- "B"
table(cpg_gr$compartment)
##
## A B
## 285654 86328
cgi <- readRDS("../input/cpgIslandExt_hg19.rds")
left_shore <- flank(cgi, width=2000, start=TRUE, ignore.strand=TRUE)
right_shore <- flank(cgi, width=2000, start=FALSE, ignore.strand=TRUE)
cgi_shore <- reduce(c(cgi, left_shore, right_shore))
cpg_gr$opensea <- ifelse(countOverlaps(cpg_gr, cgi_shore)>0, FALSE, TRUE)
table(cpg_gr$compartment, cpg_gr$opensea)
##
## FALSE TRUE
## A 172797 112857
## B 33749 52579
Prepare TCGA clinical, methylation and expression data
library(UCSCXenaTools)
## =========================================================================================
## UCSCXenaTools version 1.4.3
## Project URL: https://github.com/ropensci/UCSCXenaTools
## Usages: https://cran.r-project.org/web/packages/UCSCXenaTools/vignettes/USCSXenaTools.html
##
## If you use it in published research, please cite:
## Wang et al., (2019). The UCSCXenaTools R package: a toolkit for accessing genomics data
## from UCSC Xena platform, from cancer multi-omics to single-cell RNA-seq.
## Journal of Open Source Software, 4(40), 1627, https://doi.org/10.21105/joss.01627
## =========================================================================================
## --Enjoy it--
##
## Attaching package: 'UCSCXenaTools'
## The following object is masked from 'package:Biobase':
##
## samples
cohort <- "COAD\\."
# Get Clincal data
XenaGenerate(subset = XenaHostNames=="tcgaHub") %>%
XenaFilter(filterDatasets = cohort) %>%
XenaFilter(filterDatasets = "clinicalMatrix") -> to_get
XenaQuery(to_get) %>% XenaDownload() -> xe_download
## This will check url status, please be patient.
## All downloaded files will under directory /var/folders/zr/t7qsz9v56gv3lh8r8bmbprv00000gn/T//RtmpfVHNKe.
## The 'trans_slash' option is FALSE, keep same directory structure as Xena.
## Creating directories for datasets...
## Downloading TCGA.COAD.sampleMap/COAD_clinicalMatrix
clin = XenaPrepare(xe_download)
# Get 450k DNAm matrix
XenaGenerate(subset = XenaHostNames=="tcgaHub") %>%
XenaFilter(filterDatasets = cohort) %>%
XenaFilter(filterDatasets = "HumanMethylation450") -> to_get
XenaQuery(to_get) %>% XenaDownload() -> xe_download
## This will check url status, please be patient.
## All downloaded files will under directory /var/folders/zr/t7qsz9v56gv3lh8r8bmbprv00000gn/T//RtmpfVHNKe.
## The 'trans_slash' option is FALSE, keep same directory structure as Xena.
## Creating directories for datasets...
## Downloading TCGA.COAD.sampleMap/HumanMethylation450.gz
dnam_df = XenaPrepare(xe_download)
rn <- dnam_df$sample
dnam <- dnam_df %>% select(-1) %>% as.matrix()
rownames(dnam) <- rn
# Get gene expression matrix
XenaGenerate(subset = XenaHostNames=="tcgaHub") %>%
XenaFilter(filterDatasets = cohort) %>%
XenaFilter(filterDatasets = "HiSeqV2_PANCAN") -> to_get
XenaQuery(to_get) %>% XenaDownload() -> xe_download
## This will check url status, please be patient.
## All downloaded files will under directory /var/folders/zr/t7qsz9v56gv3lh8r8bmbprv00000gn/T//RtmpfVHNKe.
## The 'trans_slash' option is FALSE, keep same directory structure as Xena.
## Creating directories for datasets...
## Downloading TCGA.COAD.sampleMap/HiSeqV2_PANCAN.gz
expr_df <- XenaPrepare(xe_download)
rn <- expr_df$sample
expr <- expr_df %>% select(-1) %>% as.matrix()
rownames(expr) <- rn
Look correlation between hypomethylation and A/B gene expression difference
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
library(org.Hs.eg.db)
##
entrez_id <- mapIds(org.Hs.eg.db, rownames(expr), "ENTREZID", "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
symbol <- unlist(mapIds(org.Hs.eg.db, entrez_id, "SYMBOL", "ENTREZID"))
## 'select()' returned 1:1 mapping between keys and columns
gene_pos <- select(TxDb.Hsapiens.UCSC.hg19.knownGene,
keys = entrez_id[rownames(expr)],
columns=c("TXNAME", "TXCHROM", "TXSTART"),
keytype="GENEID")
## 'select()' returned many:many mapping between keys and columns
gene_pos <- gene_pos %>% filter(!is.na(TXCHROM) & !is.na(TXSTART))
gene_pos$symbol <- symbol[gene_pos$GENEID]
gene_pos <- gene_pos[!duplicated(gene_pos$symbol),]
gene_tss_gr <- GRanges(gene_pos$TXCHROM, IRanges(gene_pos$TXSTART, gene_pos$TXSTART))
mcols(gene_tss_gr) <- gene_pos[, c("GENEID", "symbol")]
gene_tss_gr$compartment <- NA
gene_tss_gr$compartment[countOverlaps(gene_tss_gr, a)>0] <- "A"
gene_tss_gr$compartment[countOverlaps(gene_tss_gr, b)>0] <- "B"
a_idx <- which(rownames(expr) %in% gene_tss_gr$symbol[gene_tss_gr$compartment=="A"])
b_idx <- which(rownames(expr) %in% gene_tss_gr$symbol[gene_tss_gr$compartment=="B"])
expr_a <- colMeans(expr[a_idx,], na.rm=TRUE)
expr_b <- colMeans(expr[b_idx,], na.rm=TRUE)
plot(expr_a, expr_b)

plot(hypometh, expr_a)

plot(hypometh, expr_b)

plot(hypometh, expr_a-expr_b)

write_tsv(data.frame(symbol=rownames(expr)[a_idx]), file="../coad_a_genes.txt", col_names=FALSE)
write_tsv(data.frame(symbol=rownames(expr)[b_idx]), file="../coad_b_genes.txt", col_names=FALSE)